Clustering is unsupervised learning: the resulting clusters are derived entirely from how the data are distributed in a given feature set, with no class labels available.
Compared to supervised learning counterparts, it is …
Your task is to try different clustering algorithms and also a range of the potential parameter(s) which affect the number of clusters including ..
on 5K colorectal patches represented by 4 different representations: PathologyGAN, ResNet50, InceptionV3 and VGG16.
5,000 non-overlapping image patches from hematoxylin & eosin (H&E) stained histological images of human colorectal cancer (CRC) and normal tissue.
PCA and UMAP were employed to reduce each feature space into 100-dimensional vectors.
9 tissue types are also available which include Adipose (ADI), background (BACK), debris (DEB), lymphocytes (LYM), mucus (MUC), smooth muscle (MUS), normal colon mucosa (NORM), cancer-associated stroma (STR), colorectal adenocarcinoma epithelium (TUM)
To assess quality of clustering solutions, several approaches are expected to be done and interpreted which include...
For more information, please see: https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation
Report on your preprocessing pipeline, the theory and intuition behind each algorithm and representation, and your parameter-searching and performance-evaluation frameworks. If there is any additional processing, give evidence/justification for how it helps.
# !pip install h5py==2.10.0
# !pip install numpy
# !pip install pandas
# !pip install scikit-learn
# !pip install scikit-network
# !pip install pickle-mixin==1.0.2
# !pip install matplotlib
# !pip install plotly
import h5py
import numpy as np
import pickle
# Open each representation's HDF5 feature file read-only, right next to its path.
# The handles are left open because the following cells read datasets from them.
pge_path = 'colon_nct_feature/pge_dim_reduced_feature.h5'
pge_content = h5py.File(pge_path, 'r')

resnet50_path = 'colon_nct_feature/resnet50_dim_reduced_feature.h5'
resnet50_content = h5py.File(resnet50_path, 'r')

inceptionv3_path = 'colon_nct_feature/inceptionv3_dim_reduced_feature.h5'
inceptionv3_content = h5py.File(inceptionv3_path, 'r')

vgg16_path = 'colon_nct_feature/vgg16_dim_reduced_feature.h5'
vgg16_content = h5py.File(vgg16_path, 'r')
vgg16_content
<HDF5 file "vgg16_dim_reduced_feature.h5" (mode r)>
# PCA features for the 4 feature sets: pge_latent, resnet50_latent, inceptionv3_latent, vgg16_latent.
# Indexing an h5py dataset with [...] reads the whole 'pca_feature' dataset
# into memory as a NumPy array.
pge_pca_feature = pge_content['pca_feature'][...]
resnet50_pca_feature = resnet50_content['pca_feature'][...]
inceptionv3_pca_feature = inceptionv3_content['pca_feature'][...]
vgg16_pca_feature = vgg16_content['pca_feature'][...]
vgg16_pca_feature
array([[ -90.864815 , 107.03243 , 116.16385 , ..., 8.259951 ,
-1.1383446, -9.175951 ],
[ -74.06226 , 117.31615 , 95.24547 , ..., -1.9739974,
6.8172655, 4.2547903],
[ -81.875336 , 122.84872 , 83.767784 , ..., -1.3287221,
13.65623 , -11.109545 ],
...,
[ 32.70835 , 31.183935 , -47.30978 , ..., 4.2004256,
3.599823 , 6.0058775],
[ -39.911785 , -105.81029 , 122.37118 , ..., -14.908715 ,
-1.1814455, -10.837135 ],
[ 32.35425 , -50.3373 , 66.50197 , ..., -3.859397 ,
-1.047523 , 2.3827858]], dtype=float32)
# UMAP features for the 4 feature sets: pge_latent, resnet50_latent, inceptionv3_latent, vgg16_latent.
# Same access pattern as the PCA features: [...] reads the whole 'umap_feature'
# dataset from each file into memory.
pge_umap_feature = pge_content['umap_feature'][...]
resnet50_umap_feature = resnet50_content['umap_feature'][...]
inceptionv3_umap_feature = inceptionv3_content['umap_feature'][...]
vgg16_umap_feature = vgg16_content['umap_feature'][...]
# Tissue type as available ground truth: labels.
# HDF5 string datasets can come back as byte strings, and str() on a bytes
# value yields its repr ("b'...'") rather than the text, so decode explicitly.
filename = np.squeeze(pge_content['file_name'])
filename = np.array([
    x.decode('utf-8') if isinstance(x, bytes) else str(x)
    for x in filename
])
# The tissue type is taken from the third '/'-separated component of each path.
labels = np.array([x.split('/')[2] for x in filename])
labels
array(['ADI', 'ADI', 'ADI', ..., 'TUM', 'TUM', 'TUM'], dtype='<U4')
import random

# Draw a reproducible random subset of 200 patches for the demo below.
random.seed(0)
all_indices = list(np.arange(len(pge_pca_feature)))
selected_index = random.sample(all_indices, 200)
# Index features and labels with the same positions so they stay aligned.
test_label = labels[selected_index]
test_data = pge_pca_feature[selected_index]
import plotly.graph_objects as go
import pandas as pd

# 3-D scatter of the sampled patches on the first three PCA components,
# with one trace per tissue type so each type gets its own legend entry.
traces = []
for name in np.unique(labels):
    in_class = test_label == name  # boolean mask, computed once per tissue type
    traces.append(
        go.Scatter3d(
            x=test_data[in_class, 0],
            y=test_data[in_class, 1],
            z=test_data[in_class, 2],
            mode='markers',
            name=name,
            marker=go.scatter3d.Marker(size=4, opacity=0.8),
        )
    )

# go.Data and go.Scene are deprecated in plotly: pass a plain list of traces
# and use go.layout.Scene instead.
data = traces
layout = go.Layout(
    showlegend=True,
    scene=go.layout.Scene(
        xaxis=go.layout.scene.XAxis(title='PC1'),
        yaxis=go.layout.scene.YAxis(title='PC2'),
        zaxis=go.layout.scene.ZAxis(title='PC3'),
    ),
)
fig = go.Figure(data=data, layout=layout)
# NOTE(review): the title contains a typo ("pricipal") and the legend title is
# still a placeholder; both strings are left as-is here.
fig.update_layout(
    title="First 3 pricipal components of PathologyGAN's PCA feature",
    legend_title="Legend Title",
)
fig.show()
/Users/keyuan/opt/anaconda3/lib/python3.9/site-packages/plotly/graph_objs/_deprecations.py:31: DeprecationWarning: plotly.graph_objs.Data is deprecated. Please replace it with a list or tuple of instances of the following types - plotly.graph_objs.Scatter - plotly.graph_objs.Bar - plotly.graph_objs.Area - plotly.graph_objs.Histogram - etc. warnings.warn( /Users/keyuan/opt/anaconda3/lib/python3.9/site-packages/plotly/graph_objs/_deprecations.py:489: DeprecationWarning: plotly.graph_objs.Scene is deprecated. Please replace it with one of the following more specific types - plotly.graph_objs.layout.Scene warnings.warn(
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering
from sknetwork.clustering import Louvain
#to create Adjacency matrix for Louvain clustering
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MinMaxScaler
# K-means with a fixed number of clusters. Alternatives named by the original
# author: GaussianMixture(), AgglomerativeClustering(), Louvain.
kmeans_model = KMeans(n_clusters=3, random_state=0)
kmeans_assignment = kmeans_model.fit_predict(test_data)

from scipy import sparse

# Louvain works on a graph, so build a weighted adjacency matrix from the
# samples: negate the pairwise distances, min-max scale them, store as CSR.
# NOTE(review): MinMaxScaler rescales every column independently, so the
# resulting matrix is generally not symmetric - confirm this is intended.
pairwise_dist = pairwise_distances(test_data)
scaled_similarity = MinMaxScaler().fit_transform(-pairwise_dist)
adjacency_matrix = sparse.csr_matrix(scaled_similarity)
louvain_model = Louvain(resolution=0.9, modularity='Newman', random_state=0)
louvain_assignment = louvain_model.fit_transform(adjacency_matrix)

import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, v_measure_score
from sklearn.model_selection import KFold, train_test_split

# Report how many distinct cluster ids each method produced.
n_kmeans_clusters = np.unique(kmeans_assignment).shape[0]
n_louvain_clusters = np.unique(louvain_assignment).shape[0]
print('Number of clusters from KMeans: %d and from Louvain: %d'%(n_kmeans_clusters, n_louvain_clusters))
Number of clusters from KMeans: 3 and from Louvain: 3
# np.unique(..., return_counts=True) returns (cluster ids, members per cluster).
louvain_counts = np.unique(louvain_assignment, return_counts=True)
kmeans_counts = np.unique(kmeans_assignment, return_counts=True)
print('Kmeans assignment counts')
# Tabulate the K-means cluster sizes, indexed by cluster id.
kmeans_ids, kmeans_sizes = kmeans_counts
pd.DataFrame({'Cluster Index': kmeans_ids, 'Number of members': kmeans_sizes}).set_index('Cluster Index')
Kmeans assignment counts
| Number of members | |
|---|---|
| Cluster Index | |
| 0 | 47 |
| 1 | 113 |
| 2 | 40 |
print('Louvain assignment counts')
# Tabulate the Louvain cluster sizes, indexed by cluster id.
louvain_ids, louvain_sizes = louvain_counts
pd.DataFrame({'Cluster Index': louvain_ids, 'Number of members': louvain_sizes}).set_index('Cluster Index')
Louvain assignment counts
| Number of members | |
|---|---|
| Cluster Index | |
| 0 | 132 |
| 1 | 67 |
| 2 | 1 |
# Silhouette is computed from the features and the assignment only;
# V-measure compares the assignment against the tissue-type labels.
kmeans_silhouette = silhouette_score(test_data, kmeans_assignment)
kmeans_v_measure = v_measure_score(test_label, kmeans_assignment)
louvain_silhouette = silhouette_score(test_data, louvain_assignment)
louvain_v_measure = v_measure_score(test_label, louvain_assignment)
# One row per metric, one column per clustering method.
metric_columns = {
    'Metrics': ['silhouette', 'V-measure'],
    'Kmeans': [kmeans_silhouette, kmeans_v_measure],
    'Louvain': [louvain_silhouette, louvain_v_measure],
}
pd.DataFrame(metric_columns).set_index('Metrics')
| Kmeans | Louvain | |
|---|---|---|
| Metrics | ||
| silhouette | 0.192233 | 0.246675 |
| V-measure | 0.268505 | 0.270140 |
def calculate_percent(sub_df, attrib):
    """Return the share of rows of ``sub_df`` held by each value of ``attrib``.

    Args:
        sub_df: DataFrame containing the column ``attrib``.
        attrib: Name of the column whose values define the groups.

    Returns:
        A DataFrame indexed by the distinct values of ``attrib``. Every other
        column of ``sub_df`` holds its per-group non-null count divided by
        the number of non-null entries in ``attrib``.
    """
    # Denominator: number of non-null entries in the grouping column.
    cnt = sub_df[attrib].count()
    # Numerator: per-group non-null counts for each remaining column.
    output_sub_df = sub_df.groupby(attrib).count()
    return output_sub_df / cnt
def _plot_cluster_composition(assignment, tissue_labels, ax, title):
    """Draw a stacked bar chart of the tissue-type percentages per cluster.

    Args:
        assignment: Cluster id of every sample.
        tissue_labels: Tissue type of every sample, aligned with ``assignment``.
        ax: Matplotlib axes to draw on.
        title: Title to put on ``ax``.

    Returns:
        The pivoted proportion table (rows: clusterID, columns: tissue type)
        that was plotted, as fractions rather than percentages.
    """
    cluster_df = pd.DataFrame({'clusterID': assignment, 'type': tissue_labels})
    # Within each cluster, compute the fraction of members of each tissue type.
    proportion_df = (
        cluster_df.groupby(['clusterID'])
        .apply(lambda x: calculate_percent(x, 'type'))
        .rename(columns={'clusterID': 'type_occurrence_percentage'})
        .reset_index()
    )
    pivoted = pd.pivot_table(
        proportion_df,
        index='clusterID',
        columns='type',
        values='type_occurrence_percentage',
    )
    (pivoted * 100).plot.bar(stacked=True, ax=ax)
    ax.set_ylabel('Percentage of tissue type')
    ax.legend(loc='upper right')
    ax.set_title(title)
    return pivoted


# Side-by-side composition of the K-means and Louvain clusters.
f, axes = plt.subplots(1, 2, figsize=(20, 5))
kmeans_proportion_df = _plot_cluster_composition(
    kmeans_assignment, test_label, axes[0], 'Cluster configuration by Kmeans'
)
# Kept under its original name; as before, it ends up holding the Louvain table.
pivoted_label_proportion_df = _plot_cluster_composition(
    louvain_assignment, test_label, axes[1], 'Cluster configuration by Louvain'
)
# plt.show() instead of f.show(): Figure.show() warns under a non-GUI backend.
plt.show()
/var/folders/5m/rcxypnrn19nblt9t9403ffqw0000gn/T/ipykernel_81776/3428919821.py:27: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.